Video version:
Link: https://pan.baidu.com/s/1GTNBCrZ5hBw4w2CYj2jMeA?pwd=73c8 Access code: 73c8
VS Code
Install these VS Code extensions: Chinese (Simplified) Language Pack, Remote - SSH, Python, Pylance, Python Debugger, Python Environment Manager
Check the group files for your own username and password.
Connect in VS Code (add this to your ~/.ssh/config):
Host 36.212.4.98
    HostName 36.212.4.98
    User tangou
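Before connecting through VS Code, it can help to verify the login from a plain local terminal (host and user taken from the config above):
ssh tangou@36.212.4.98 'echo connected && hostname'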
cat /data/tools/setenv.sh >> ~/.bashrc
source ~/.bashrc
Check that the setup works:
conda info --envs # list conda environments
ollama list # list the models available in ollama
ollama run bsahane/Qwen2.5-VL-7B-Instruct:Q4_K_M_benxh # run ollama interactively; press Ctrl-D to exit
# http://127.0.0.1:18099
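Assuming the address above is the local Ollama HTTP API, a quick sanity check could be:
curl http://127.0.0.1:18099/api/tags # lists the installed models via the API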
source /data/tools/setproxy.sh # enable the proxy (VPN)
source /data/tools/unsetproxy.sh # disable the proxy (VPN)
cd /home/tangou/tangou2 # use your own path
cp -r /data/3e/share/* /home/tangou/tangou2/
After copying:
# enable the proxy
source /data/tools/setproxy.sh
# replace tg10 with your own name
conda create -n tg10 python=3.10.16
# activate the environment
conda activate tg10
# if you reopen the terminal, the proxy is no longer active; re-enable it
source /data/tools/setproxy.sh
# install the packages; the first run has no cache, so the downloads take a long time
pip install -r requirements.txt
# additionally install this wheel; it is not on the pip index
pip install EETQ-1.0.1-cp310-cp310-linux_x86_64.whl
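A quick check that the environment came up correctly (assumes torch is in requirements.txt):
python -c "import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"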
First, connect VS Code to the shared /data directory.
Copy the model path; here we fine-tune the 32B model.
Go back to your original VS Code window and put the copied model path into the config, as shown below.
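The path goes into the model_name_or_path field of the training YAML (LLaMA-Factory's standard key); one way to set it from the shell, with a placeholder path standing in for whatever you copied:
sed -i 's|^model_name_or_path:.*|model_name_or_path: /data/path/to/Qwen2.5-VL-32B-Instruct|' qwen2.5vl_lora_sft_3.yaml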
Open a terminal:
# if you reopen the terminal, re-enable the proxy
source /data/tools/setproxy.sh
# activate your python environment
conda activate tg10
# train
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_3.yaml
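While training runs, GPU usage can be watched from a second terminal:
watch -n 1 nvidia-smi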
# view the run logs; swanlog is a relative path; if the port is taken, pass --port <another port>
conda activate tg10
swanlab watch swanlog --port 5092
Open http://127.0.0.1:5092 in your local browser.
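If VS Code does not forward the port automatically, a manual SSH tunnel works (user and host from the SSH config above):
ssh -N -L 5092:127.0.0.1:5092 tangou@36.212.4.98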
Web chat with the trained model:
source /data/tools/setproxy.sh
conda activate tg10
# if the port is in use, pick another one
export GRADIO_SERVER_PORT=7860
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli webchat qwen2.5vl_lora_sft_3_inference.yaml
Open http://127.0.0.1:7860 in your local browser (the server binds to 0.0.0.0; forward the port the same way as above).
Test with the data we just used for training.
Download it to your local machine.
Inference (this image was one of the training samples):
source /data/tools/setproxy.sh
conda activate tg10
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_3_evaluation.yaml
Run the web UI:
source /data/tools/setproxy.sh
conda activate tg10
# if the port is in use, pick another one
export GRADIO_SERVER_PORT=7860
llamafactory-cli webui
Download models and datasets:
source /data/tools/setproxy.sh
conda activate tg10
huggingface-cli login # token tutorial: https://blog.csdn.net/m0_52625549/article/details/134255660
----
export HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" # set the cache path (the shared directory from earlier)
# dataset
huggingface-cli download --resume-download --repo-type dataset llamafactory/RLHF-V --local-dir-use-symlinks False
# model
huggingface-cli download --resume-download Qwen/Qwen2.5-VL-7B-Instruct --local-dir-use-symlinks False
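Downloads land under the shared cache set above; a quick way to confirm:
ls /data/huggingface/hub | grep -i qwen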
llamafactory-cli export qwen2.5vl_lora_sft_3_export.yaml
Export complete.
ollama create qwen2_5_vl_custom -f /home/tangou/tangou2/results/export/qwen2.5_vl_lora_sft_dev/Modelfile
ollama run qwen2_5_vl_custom
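To confirm the custom model registered and responds (the prompt and image path are illustrative):
ollama list | grep qwen2_5_vl_custom
ollama run qwen2_5_vl_custom "Describe the image /path/to/test.jpg"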
Evaluation
source /data/tools/setproxy.sh
conda activate tg10
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_3_evaluation_no_lora.yaml
Out of memory
After fine-tuning the 32B model, evaluation with 8-bit quantization runs out of memory: on 8 GPUs, 8-bit uses about 40 GB per card, while 4-bit uses about 20 GB per card. Results below.
Edit the quantization bit setting in qwen2.5vl_lora_sft_3_evaluation.yaml to 4 or 8; if you don't need quantization, comment those lines out entirely.
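For example, switching the config to 4-bit from the shell (assumes the YAML uses LLaMA-Factory's quantization_bit key):
sed -i 's/^quantization_bit:.*/quantization_bit: 4/' qwen2.5vl_lora_sft_3_evaluation.yaml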
source /data/tools/setproxy.sh
conda activate tg10
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_3_evaluation.yaml
4-bit evaluation results
Dataset: coco_2014_caption (a subset)
60 / 30 / 10
10 epochs
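Assuming the 60 / 30 / 10 above is a train/val/test split, a rough shell sketch over a JSON-lines file (the filename is a placeholder; adjust to the actual dataset format):
shuf coco_subset.jsonl > shuffled.jsonl
total=$(wc -l < shuffled.jsonl)
head -n $((total * 60 / 100)) shuffled.jsonl > train.jsonl
tail -n +$((total * 60 / 100 + 1)) shuffled.jsonl | head -n $((total * 30 / 100)) > val.jsonl
tail -n +$((total * 90 / 100 + 1)) shuffled.jsonl > test.jsonl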
conda activate tg10
# 7B, fine-tuning, per_device_train_batch_size: 8, 28-35 GB VRAM per card on 8 GPUs, ~30 s per epoch
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_7B_train.yaml
# 7B before fine-tuning, evaluation, per_device_train_batch_size: 4, ~26 GB VRAM per card on 8 GPUs
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_7B_evaluation_before.yaml
# 7B after fine-tuning, evaluation, per_device_train_batch_size: 4, ~26 GB VRAM per card on 8 GPUs
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_7B_evaluation_after.yaml
7B fine-tuning loss (convergence is not obvious)
Results
// 7B before fine-tuning
{
"predict_bleu-4": 8.043468749999999,
"predict_model_preparation_time": 0.0075,
"predict_rouge-1": 18.08796875,
"predict_rouge-2": 4.3805499999999995,
"predict_rouge-l": 11.1585375,
"predict_runtime": 8.4024,
"predict_samples_per_second": 3.57,
"predict_steps_per_second": 0.119
}
// 7B after fine-tuning
{
"predict_bleu-4": 8.093843750000001,
"predict_model_preparation_time": 0.0101,
"predict_rouge-1": 18.75373125,
"predict_rouge-2": 4.851125,
"predict_rouge-l": 11.525771875,
"predict_runtime": 12.6196,
"predict_samples_per_second": 2.377,
"predict_steps_per_second": 0.079
}
conda activate tg10
# 32B, fine-tuning, per_device_train_batch_size: 6, ~40 GB VRAM per card on 8 GPUs, ~120 s per epoch
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_32B_train.yaml
# 32B before fine-tuning, evaluation, 8-bit quantization, per_device_train_batch_size: 2, ~38 GB VRAM per card on 8 GPUs
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_32B_evaluation_before_8bit.yaml
# 32B before fine-tuning, evaluation, 4-bit quantization, per_device_train_batch_size: 2, ~24 GB VRAM per card on 8 GPUs
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_32B_evaluation_before_4bit.yaml
# 32B after fine-tuning, evaluation, 8-bit quantization: out of memory
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_32B_evaluation_after_8bit.yaml
# 32B after fine-tuning, evaluation, 4-bit quantization, per_device_train_batch_size: 2, ~24 GB VRAM per card on 8 GPUs
NCCL_P2P_LEVEL=NVL HUGGINGFACE_HUB_CACHE="/data/huggingface/hub" FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 llamafactory-cli train qwen2.5vl_lora_sft_32B_evaluation_after_4bit.yaml
32B fine-tuning loss (convergence is not obvious)
Results
// 32B before fine-tuning, 8-bit
{
"predict_bleu-4": 4.050425000000001,
"predict_model_preparation_time": 0.0026,
"predict_rouge-1": 11.830003125,
"predict_rouge-2": 2.671021875,
"predict_rouge-l": 5.869315625,
"predict_runtime": 236.453,
"predict_samples_per_second": 0.127,
"predict_steps_per_second": 0.008
}
// 32B before fine-tuning, 4-bit
{
"predict_bleu-4": 3.9843874999999995,
"predict_model_preparation_time": 0.0026,
"predict_rouge-1": 11.7092875,
"predict_rouge-2": 2.0235125,
"predict_rouge-l": 5.531334375,
"predict_runtime": 193.3049,
"predict_samples_per_second": 0.155,
"predict_steps_per_second": 0.01
}
// 32B after fine-tuning, 4-bit
{
"predict_bleu-4": 4.27783125,
"predict_model_preparation_time": 0.0129,
"predict_rouge-1": 12.3642375,
"predict_rouge-2": 2.23608125,
"predict_rouge-l": 6.093603125,
"predict_runtime": 220.0491,
"predict_samples_per_second": 0.136,
"predict_steps_per_second": 0.009
}